Similarity search on Stack Overflow posts

Stack Overflow posts


In [ ]:
%%sql -d standard
SELECT 
  id, title, body, view_count, tags
FROM
  `gcp-samples2.stackoverflow_demo.top100K_posts`
ORDER BY
  view_count DESC
LIMIT
  5

Segmentation


In [ ]:
%%sql -d standard
CREATE TEMPORARY FUNCTION segmentation(body STRING)
RETURNS ARRAY<STRING>
LANGUAGE js AS """

// remove HTML tags, convert to lowercase and extract words
return body.replace(/(<[^>]+>|&#[^;]+;)/g, '').toLowerCase().match(/\\w\\w+/g);

""";

SELECT
  id, 
  segmentation(CONCAT(title, body, tags)) as words
FROM
  `gcp-samples2.stackoverflow_demo.top100K_posts`
LIMIT
  5

# (for full query on 10M posts takes about 30 secs with 14.1GB)

Calculate Feature Vectors (TF-IDF) for each post


In [ ]:
%%sql -d standard
CREATE TEMPORARY FUNCTION calc_tf_idf(words ARRAY<STRING>)
RETURNS STRING
LANGUAGE js AS """

// count each word in this post
var word_count = new Object();
for (word of words) {
  if (word_count[word]) {
    word_count[word]++;
  } else {
    word_count[word] = 1;
  }
}

// calculate TF-IDF values for each word
// tf = word count / total word count
// idf = log (100K posts / # of posts with the word)
// tf_idf = tf * idf
var total_posts = 100000;
var words_in_post = words.length;
var tf_idf = new Object();
var norm_sum = 0;
for (word in word_count) {
  if (word_dict[word] == null) {
    word_dict[word] = 1;
  }
  var tf = (word_count[word] / words_in_post);
  var idf = Math.log(total_posts / word_dict[word]);
  tf_idf[word] = tf * idf;
  norm_sum += tf_idf[word]^2;
}

// normarizing TF-IDF values with L2 norm
for (word in tf_idf) {
  tf_idf[word] = (tf_idf[word] / Math.sqrt(norm_sum)).toFixed(5);
  if (tf_idf[word] < 0.01) {
    delete tf_idf[word]; // remove trivial words
  }
}

return JSON.stringify(tf_idf);

"""
OPTIONS (
  library="gs://gcp-samples2-stackoverflow/word_dict_0.js",
  library="gs://gcp-samples2-stackoverflow/word_dict_1.js",
  library="gs://gcp-samples2-stackoverflow/word_dict_2.js",
  library="gs://gcp-samples2-stackoverflow/word_dict_3.js",
  library="gs://gcp-samples2-stackoverflow/word_dict_4.js",
  library="gs://gcp-samples2-stackoverflow/word_dict_5.js",
  library="gs://gcp-samples2-stackoverflow/word_dict_6.js"
);

SELECT
  id, calc_tf_idf(words) AS tf_idf
FROM
  `gcp-samples2.stackoverflow_demo.top100K_posts_segmented` AS posts
LIMIT
  10

# (for full query on 10M posts takes 50 secs with 12.2GB)

Similarity Search with Feature Vectors


In [ ]:
%%sql -d standard
CREATE TEMPORARY FUNCTION calc_similarity(tf_idf_json_0 STRING, tf_idf_json_1 STRING)
RETURNS FLOAT64
LANGUAGE js AS """

// parse JSON to extract tf_idf
var tf_idf_0 = JSON.parse(tf_idf_json_0);
var tf_idf_1 = JSON.parse(tf_idf_json_1);

// calculate cosine similarity
var similarity = 0;
for (word in tf_idf_0) {
  var t0 = tf_idf_0[word] ? Number(tf_idf_0[word]) : 0;
  var t1 = tf_idf_1[word] ? Number(tf_idf_1[word]) : 0;
  similarity += t0 * t1;
}

return similarity;
""";

SELECT
  title,
  body,
  tags,
  similarity
FROM
  (
    SELECT
      t1.id, 
      calc_similarity(tf_idf_0, t1.tf_idf) AS similarity
    FROM
      (
        SELECT tf_idf AS tf_idf_0
        FROM `gcp-samples2.stackoverflow_demo.top100K_posts_tf_idf` AS t0
        WHERE id = 92082 
      )
    CROSS JOIN
      `gcp-samples2.stackoverflow_demo.top100K_posts_tf_idf` AS t1
    ORDER BY
      similarity DESC
    LIMIT
      10
  )
JOIN
  `gcp-samples2.stackoverflow_demo.top100K_posts` AS t2
USING (id)  
ORDER BY
  similarity DESC

# (for full query on 10M posts takes about 16 secs with 16 GB)

# try other posts : 
# 5585779 (String to Int in Java)
# 1789945 (substring in JS)
# 92082 (add a column in SQL Server)
# 503093 (redirecting in jQuery)

In [ ]: